Instal relavent libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.92 loaded
library(ggcorrplot)
## Loading required package: ggplot2
library(caret)
## Loading required package: lattice
library(tidyr)
library(ggplot2)
library(stringr)
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.3.3
Load 2014 CSV file for Analysis
data_2014 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2014.csv")
str(data_2014)
## 'data.frame': 42727 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 6830 128 8 168 85 352 100 59 23 5 ...
## $ Tot_Suplrs : int 8928 222 8 246 107 447 145 91 33 7 ...
## $ Tot_Suplr_Benes : int 12351 305 NA 293 177 504 163 97 43 NA ...
## $ Tot_Suplr_Clms : int 54724 1332 27 1151 843 2246 749 355 156 19 ...
## $ Tot_Suplr_Srvcs : int 5258684 138847 2160 106080 77090 184666 61110 45340 15360 1540 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 10.8 10.3 10.7 10.9 10.5 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 2.78 2.8 2.8 2.8 2.79 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2.13 2.07 2.16 2.17 2.16 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2.13 2.07 2.16 2.17 2.16 ...
# Convert empty strings or whitespace to NA
data_2014 <- data_2014 %>%
mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))
# Check for missing values in each column
colSums(is.na(data_2014))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 1841 10
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 4810
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2014$Tot_Suplr_Benes)) / nrow(data_2014) * 100
missing_percentage
## [1] 11.25752
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2014$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2014) * 100
missing_percentage
## [1] 4.308751
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2014$Rfrg_Prvdr_Geo_Desc )) / nrow(data_2014) * 100
missing_percentage
## [1] 0.0234044
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2014 <- data_2014[!is.na(data_2014$Rfrg_Prvdr_Geo_Desc), ]
# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2014 <- data_2014 %>%
mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))
#convert the column back to integer after the imputation
data_2014$Tot_Suplr_Benes <- as.integer(data_2014$Tot_Suplr_Benes)
# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2014$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2014$Rfrg_Prvdr_Geo_Cd), "National", data_2014$Rfrg_Prvdr_Geo_Cd)
# Check for missing values in each updated column
colSums(is.na(data_2014))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Check the updated column
str(data_2014)
## 'data.frame': 42717 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 6830 128 8 168 85 352 100 59 23 5 ...
## $ Tot_Suplrs : int 8928 222 8 246 107 447 145 91 33 7 ...
## $ Tot_Suplr_Benes : int 12351 305 86 293 177 504 163 97 43 86 ...
## $ Tot_Suplr_Clms : int 54724 1332 27 1151 843 2246 749 355 156 19 ...
## $ Tot_Suplr_Srvcs : int 5258684 138847 2160 106080 77090 184666 61110 45340 15360 1540 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 10.8 10.3 10.7 10.9 10.5 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 2.78 2.8 2.8 2.8 2.79 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2.13 2.07 2.16 2.17 2.16 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2.13 2.07 2.16 2.17 2.16 ...
Load 2015 CSV file for Analysis
data_2015 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2015.csv")
str(data_2015)
## 'data.frame': 42372 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 7725 145 15 206 101 417 121 77 27 6 ...
## $ Tot_Suplrs : int 10048 258 17 301 129 519 165 103 40 10 ...
## $ Tot_Suplr_Benes : int 14759 373 17 377 209 620 194 108 59 NA ...
## $ Tot_Suplr_Clms : int 62771 1557 67 1433 960 2463 797 431 205 23 ...
## $ Tot_Suplr_Srvcs : int 6275925 172090 4880 140880 94520 213915 69950 53685 21130 1450 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 12.3 11.7 13.5 12.7 11.7 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 2.79 2.8 2.8 2.8 2.8 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2.14 2.11 2.14 2.17 2.15 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2.14 2.11 2.14 2.17 2.15 ...
# Convert empty strings or whitespace to NA
data_2015 <- data_2015 %>%
mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))
# Check for missing values in each column
colSums(is.na(data_2015))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 1808 8
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 4920
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2015$Tot_Suplr_Benes)) / nrow(data_2015) * 100
missing_percentage
## [1] 11.61144
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2015$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2015) * 100
missing_percentage
## [1] 4.266969
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2015$Rfrg_Prvdr_Geo_Desc )) / nrow(data_2015) * 100
missing_percentage
## [1] 0.01888039
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2015 <- data_2015[!is.na(data_2015$Rfrg_Prvdr_Geo_Desc), ]
# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2015 <- data_2015 %>%
mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))
#convert the column back to integer after the imputation
data_2015$Tot_Suplr_Benes <- as.integer(data_2015$Tot_Suplr_Benes)
# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2015$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2015$Rfrg_Prvdr_Geo_Cd), "National", data_2015$Rfrg_Prvdr_Geo_Cd)
# Check for missing values in each updated column
colSums(is.na(data_2015))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Check the updated column
str(data_2015)
## 'data.frame': 42364 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 7725 145 15 206 101 417 121 77 27 6 ...
## $ Tot_Suplrs : int 10048 258 17 301 129 519 165 103 40 10 ...
## $ Tot_Suplr_Benes : int 14759 373 17 377 209 620 194 108 59 89 ...
## $ Tot_Suplr_Clms : int 62771 1557 67 1433 960 2463 797 431 205 23 ...
## $ Tot_Suplr_Srvcs : int 6275925 172090 4880 140880 94520 213915 69950 53685 21130 1450 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 12.3 11.7 13.5 12.7 11.7 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 2.79 2.8 2.8 2.8 2.8 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2.14 2.11 2.14 2.17 2.15 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2.14 2.11 2.14 2.17 2.15 ...
Load 2016 CSV file for Analysis
data_2016 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2016.csv")
str(data_2016)
## 'data.frame': 42259 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 8346 145 13 215 102 451 146 85 29 11 ...
## $ Tot_Suplrs : int 11338 280 14 317 149 591 221 125 51 17 ...
## $ Tot_Suplr_Benes : int 18408 450 17 448 261 739 263 133 78 15 ...
## $ Tot_Suplr_Clms : int 82003 2053 66 1784 1266 3299 1078 477 297 41 ...
## $ Tot_Suplr_Srvcs : int 8247311 217440 6130 176770 115590 284860 100410 55750 32130 5040 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 14.2 13.8 15.3 14.7 14 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 2.8 2.8 2.8 2.84 2.8 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2.16 2.13 2.13 2.2 2.15 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2.16 2.13 2.13 2.2 2.15 ...
# Convert empty strings or whitespace to NA
data_2016 <- data_2016 %>%
mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))
# Check for missing values in each column
colSums(is.na(data_2016))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 1791 6
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 4840
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2016$Tot_Suplr_Benes)) / nrow(data_2016) * 100
missing_percentage
## [1] 11.45318
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2016$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2016) * 100
missing_percentage
## [1] 4.23815
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2016$Rfrg_Prvdr_Geo_Desc )) / nrow(data_2016) * 100
missing_percentage
## [1] 0.01419816
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2016 <- data_2016[!is.na(data_2016$Rfrg_Prvdr_Geo_Desc), ]
# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2016 <- data_2016 %>%
mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))
#convert the column back to integer after the imputation
data_2016$Tot_Suplr_Benes <- as.integer(data_2016$Tot_Suplr_Benes)
# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2016$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2016$Rfrg_Prvdr_Geo_Cd), "National", data_2016$Rfrg_Prvdr_Geo_Cd)
# Check for missing values in each updated column
colSums(is.na(data_2016))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Check the updated column
str(data_2016)
## 'data.frame': 42253 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 8346 145 13 215 102 451 146 85 29 11 ...
## $ Tot_Suplrs : int 11338 280 14 317 149 591 221 125 51 17 ...
## $ Tot_Suplr_Benes : int 18408 450 17 448 261 739 263 133 78 15 ...
## $ Tot_Suplr_Clms : int 82003 2053 66 1784 1266 3299 1078 477 297 41 ...
## $ Tot_Suplr_Srvcs : int 8247311 217440 6130 176770 115590 284860 100410 55750 32130 5040 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 14.2 13.8 15.3 14.7 14 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 2.8 2.8 2.8 2.84 2.8 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2.16 2.13 2.13 2.2 2.15 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2.16 2.13 2.13 2.2 2.15 ...
Load 2017 CSV file for Analysis
data_2017 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2017.csv")
str(data_2017)
## 'data.frame': 42127 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 9621 164 22 251 118 540 145 102 33 14 ...
## $ Tot_Suplrs : int 12571 307 23 379 171 725 234 138 62 21 ...
## $ Tot_Suplr_Benes : int 21648 495 27 545 290 938 294 170 103 21 ...
## $ Tot_Suplr_Clms : int 97398 2211 87 2117 1573 4158 1284 626 335 69 ...
## $ Tot_Suplr_Srvcs : int 9867276 233770 8240 205464 140650 358590 113030 89822 44740 7680 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 15.3 14.8 16.4 16 14.8 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 9.61 9.47 9.75 9.73 9.55 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 7.4 7.14 7.6 7.54 7.21 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 7.4 7.14 7.6 7.54 7.21 ...
# Convert empty strings or whitespace to NA
data_2017 <- data_2017 %>%
mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))
# Check for missing values in each column
colSums(is.na(data_2017))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 1807 38
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 4842
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2017$Tot_Suplr_Benes)) / nrow(data_2017) * 100
missing_percentage
## [1] 11.49382
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2017$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2017) * 100
missing_percentage
## [1] 4.289411
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2017$Rfrg_Prvdr_Geo_Desc )) / nrow(data_2017) * 100
missing_percentage
## [1] 0.09020343
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2017 <- data_2017[!is.na(data_2017$Rfrg_Prvdr_Geo_Desc), ]
# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2017 <- data_2017 %>%
mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))
#convert the column back to integer after the imputation
data_2017$Tot_Suplr_Benes <- as.integer(data_2017$Tot_Suplr_Benes)
# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2017$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2017$Rfrg_Prvdr_Geo_Cd), "National", data_2017$Rfrg_Prvdr_Geo_Cd)
# Check for missing values in each updated column
colSums(is.na(data_2017))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Check the updated column
str(data_2017)
## 'data.frame': 42089 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 9621 164 22 251 118 540 145 102 33 14 ...
## $ Tot_Suplrs : int 12571 307 23 379 171 725 234 138 62 21 ...
## $ Tot_Suplr_Benes : int 21648 495 27 545 290 938 294 170 103 21 ...
## $ Tot_Suplr_Clms : int 97398 2211 87 2117 1573 4158 1284 626 335 69 ...
## $ Tot_Suplr_Srvcs : int 9867276 233770 8240 205464 140650 358590 113030 89822 44740 7680 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 15.3 14.8 16.4 16 14.8 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 9.61 9.47 9.75 9.73 9.55 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 7.4 7.14 7.6 7.54 7.21 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 7.4 7.14 7.6 7.54 7.21 ...
Load 2018 CSV file for Analysis
data_2018 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2018.csv")
str(data_2018)
## 'data.frame': 42397 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 10626 173 23 269 111 612 190 108 31 16 ...
## $ Tot_Suplrs : int 14509 346 24 409 194 879 292 155 65 30 ...
## $ Tot_Suplr_Benes : int 26102 591 30 630 353 1202 414 197 113 30 ...
## $ Tot_Suplr_Clms : int 113148 2624 96 2425 1809 4985 1617 711 357 95 ...
## $ Tot_Suplr_Srvcs : int 11902027 290663 9620 249205 167920 461317 146890 90050 57000 13000 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 15.6 14.6 17.5 16.3 15.3 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 10.5 10 10.6 10.7 10.4 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 8.12 7.65 8.31 8.3 7.99 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 8.12 7.65 8.31 8.3 7.99 ...
# Convert empty strings or whitespace to NA
data_2018 <- data_2018 %>%
mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))
# Check for missing values in each column
colSums(is.na(data_2018))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 1793 29
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 4667
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2018$Tot_Suplr_Benes)) / nrow(data_2018) * 100
missing_percentage
## [1] 11.00785
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2018$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2018) * 100
missing_percentage
## [1] 4.229073
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2018$Rfrg_Prvdr_Geo_Desc )) / nrow(data_2018) * 100
missing_percentage
## [1] 0.06840107
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2018 <- data_2018[!is.na(data_2018$Rfrg_Prvdr_Geo_Desc), ]
# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2018 <- data_2018 %>%
mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))
#convert the column back to integer after the imputation
data_2018$Tot_Suplr_Benes <- as.integer(data_2018$Tot_Suplr_Benes)
# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2018$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2018$Rfrg_Prvdr_Geo_Cd), "National", data_2018$Rfrg_Prvdr_Geo_Cd)
# Check for missing values in each updated column
colSums(is.na(data_2018))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Check the updated column
str(data_2018)
## 'data.frame': 42368 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 10626 173 23 269 111 612 190 108 31 16 ...
## $ Tot_Suplrs : int 14509 346 24 409 194 879 292 155 65 30 ...
## $ Tot_Suplr_Benes : int 26102 591 30 630 353 1202 414 197 113 30 ...
## $ Tot_Suplr_Clms : int 113148 2624 96 2425 1809 4985 1617 711 357 95 ...
## $ Tot_Suplr_Srvcs : int 11902027 290663 9620 249205 167920 461317 146890 90050 57000 13000 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 15.6 14.6 17.5 16.3 15.3 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 10.5 10 10.6 10.7 10.4 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 8.12 7.65 8.31 8.3 7.99 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 8.12 7.65 8.31 8.3 7.99 ...
Load 2019 CSV file for Analysis
data_2019 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2019.csv")
str(data_2019)
## 'data.frame': 42198 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 11583 181 29 286 123 653 214 113 33 16 ...
## $ Tot_Suplrs : int 15921 354 28 437 207 986 334 186 77 30 ...
## $ Tot_Suplr_Benes : int 31138 695 35 725 433 1438 500 226 138 27 ...
## $ Tot_Suplr_Clms : int 134823 3109 112 2780 2263 5952 2052 812 421 101 ...
## $ Tot_Suplr_Srvcs : int 14425456 366840 11500 288320 200590 565320 187550 99900 64860 10680 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 16.1 14.7 17.7 16.6 15.8 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 10.7 10 10.9 10.9 10.7 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 8.31 7.7 8.55 8.47 8.21 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 8.31 7.7 8.55 8.47 8.21 ...
# Convert empty strings or whitespace to NA
data_2019 <- data_2019 %>%
mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))
# Check for missing values in each column
colSums(is.na(data_2019))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 1787 33
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 4662
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2019$Tot_Suplr_Benes)) / nrow(data_2019) * 100
missing_percentage
## [1] 11.04792
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2019$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2019) * 100
missing_percentage
## [1] 4.234798
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2019$Rfrg_Prvdr_Geo_Desc )) / nrow(data_2019) * 100
missing_percentage
## [1] 0.07820276
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2019 <- data_2019[!is.na(data_2019$Rfrg_Prvdr_Geo_Desc), ]
# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2019 <- data_2019 %>%
mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))
#convert the column back to integer after the imputation
data_2019$Tot_Suplr_Benes <- as.integer(data_2019$Tot_Suplr_Benes)
# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2019$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2019$Rfrg_Prvdr_Geo_Cd), "National", data_2019$Rfrg_Prvdr_Geo_Cd)
# Check for missing values in each updated column
colSums(is.na(data_2019))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Check the updated column
str(data_2019)
## 'data.frame': 42165 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 11583 181 29 286 123 653 214 113 33 16 ...
## $ Tot_Suplrs : int 15921 354 28 437 207 986 334 186 77 30 ...
## $ Tot_Suplr_Benes : int 31138 695 35 725 433 1438 500 226 138 27 ...
## $ Tot_Suplr_Clms : int 134823 3109 112 2780 2263 5952 2052 812 421 101 ...
## $ Tot_Suplr_Srvcs : int 14425456 366840 11500 288320 200590 565320 187550 99900 64860 10680 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 16.1 14.7 17.7 16.6 15.8 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 10.7 10 10.9 10.9 10.7 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 8.31 7.7 8.55 8.47 8.21 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 8.31 7.7 8.55 8.47 8.21 ...
Load 2020 CSV file for Analysis
data_2020 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2020.csv")
str(data_2020)
## 'data.frame': 41098 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 11718 194 28 276 120 648 223 125 34 16 ...
## $ Tot_Suplrs : int 16373 365 26 445 224 1035 345 216 72 31 ...
## $ Tot_Suplr_Benes : int 33547 728 36 804 477 1584 540 257 128 32 ...
## $ Tot_Suplr_Clms : int 146868 3289 141 2944 2629 6515 2236 909 435 106 ...
## $ Tot_Suplr_Srvcs : int 16113154 458510 13450 317610 235900 646460 209820 110510 59550 9780 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 15.5 12.5 14.2 15.8 14.8 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 10.13 8.41 10.01 10.27 10.02 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 7.97 6.54 7.94 8.1 7.83 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 7.86 6.46 7.82 7.99 7.72 ...
# Convert empty strings or whitespace to NA
data_2020 <- data_2020 %>%
mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))
# Check for missing values in each column
colSums(is.na(data_2020))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 1723 2
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 4782
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2020$Tot_Suplr_Benes)) / nrow(data_2020) * 100
missing_percentage
## [1] 11.6356
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2020$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2020) * 100
missing_percentage
## [1] 4.192418
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2020$Rfrg_Prvdr_Geo_Desc )) / nrow(data_2020) * 100
missing_percentage
## [1] 0.004866417
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2020 <- data_2020[!is.na(data_2020$Rfrg_Prvdr_Geo_Desc), ]
# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2020 <- data_2020 %>%
mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))
#convert the column back to integer after the imputation
data_2020$Tot_Suplr_Benes <- as.integer(data_2020$Tot_Suplr_Benes)
# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2020$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2020$Rfrg_Prvdr_Geo_Cd), "National", data_2020$Rfrg_Prvdr_Geo_Cd)
# Check for missing values in each updated column
colSums(is.na(data_2020))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
# Check the updated column
str(data_2020)
## 'data.frame': 41096 obs. of 18 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 11718 194 28 276 120 648 223 125 34 16 ...
## $ Tot_Suplrs : int 16373 365 26 445 224 1035 345 216 72 31 ...
## $ Tot_Suplr_Benes : int 33547 728 36 804 477 1584 540 257 128 32 ...
## $ Tot_Suplr_Clms : int 146868 3289 141 2944 2629 6515 2236 909 435 106 ...
## $ Tot_Suplr_Srvcs : int 16113154 458510 13450 317610 235900 646460 209820 110510 59550 9780 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 15.5 12.5 14.2 15.8 14.8 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 10.13 8.41 10.01 10.27 10.02 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 7.97 6.54 7.94 8.1 7.83 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 7.86 6.46 7.82 7.99 7.72 ...
# Add the Year column to each dataset
data_2014$Year <- as.integer(2014)
data_2015$Year <- as.integer(2015)
data_2016$Year <- as.integer(2016)
data_2017$Year <- as.integer(2017)
data_2018$Year <- as.integer(2018)
data_2019$Year <- as.integer(2019)
data_2020$Year <- as.integer(2020)
Combine all the Data
# Combine all data frames into one
combined_data <- rbind(data_2014, data_2015, data_2016, data_2017, data_2018, data_2019, data_2020)
# View the combined data
head(combined_data)
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 1 National National National
## 2 State 01 Alabama
## 3 State 02 Alaska
## 4 State 04 Arizona
## 5 State 05 Arkansas
## 6 State 06 California
## RBCS_Lvl RBCS_Id RBCS_Desc
## 1 Drugs Administered Through DME DG000N DME-Drugs Administered Through DME
## 2 Drugs Administered Through DME DG000N DME-Drugs Administered Through DME
## 3 Drugs Administered Through DME DG000N DME-Drugs Administered Through DME
## 4 Drugs Administered Through DME DG000N DME-Drugs Administered Through DME
## 5 Drugs Administered Through DME DG000N DME-Drugs Administered Through DME
## 6 Drugs Administered Through DME DG000N DME-Drugs Administered Through DME
## HCPCS_Cd
## 1 J1817
## 2 J1817
## 3 J1817
## 4 J1817
## 5 J1817
## 6 J1817
## HCPCS_Desc
## 1 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 2 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 3 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 4 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 5 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 6 Insulin for administration through dme (i.e., insulin pump) per 50 units
## Suplr_Rentl_Ind Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes Tot_Suplr_Clms
## 1 N 6830 8928 12351 54724
## 2 N 128 222 305 1332
## 3 N 8 8 86 27
## 4 N 168 246 293 1151
## 5 N 85 107 177 843
## 6 N 352 447 504 2246
## Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt
## 1 5258684 10.76232 2.783783
## 2 138847 10.25206 2.796111
## 3 2160 10.69145 2.800000
## 4 106080 10.87012 2.800000
## 5 77090 10.45740 2.791503
## 6 184666 10.79762 2.799330
## Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt Year
## 1 2.131986 2.131986 2014
## 2 2.067193 2.067193 2014
## 3 2.164935 2.164935 2014
## 4 2.165747 2.165747 2014
## 5 2.160118 2.160118 2014
## 6 2.152167 2.152167 2014
# Write the combined data to a new CSV file
#write.csv(combined_data, "combined_data.csv", row.names = FALSE)
# View the structure of the data
str(combined_data)
## 'data.frame': 295052 obs. of 19 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "National" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "National" "01" "02" "04" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "National" "Alabama" "Alaska" "Arizona" ...
## $ RBCS_Lvl : chr "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
## $ RBCS_Id : chr "DG000N" "DG000N" "DG000N" "DG000N" ...
## $ RBCS_Desc : chr "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
## $ HCPCS_Cd : chr "J1817" "J1817" "J1817" "J1817" ...
## $ HCPCS_Desc : chr "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 6830 128 8 168 85 352 100 59 23 5 ...
## $ Tot_Suplrs : int 8928 222 8 246 107 447 145 91 33 7 ...
## $ Tot_Suplr_Benes : int 12351 305 86 293 177 504 163 97 43 86 ...
## $ Tot_Suplr_Clms : int 54724 1332 27 1151 843 2246 749 355 156 19 ...
## $ Tot_Suplr_Srvcs : int 5258684 138847 2160 106080 77090 184666 61110 45340 15360 1540 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 10.8 10.3 10.7 10.9 10.5 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 2.78 2.8 2.8 2.8 2.79 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2.13 2.07 2.16 2.17 2.16 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2.13 2.07 2.16 2.17 2.16 ...
## $ Year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# Check for missing values in combined_data column
colSums(is.na(combined_data))
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 0 0 0
## RBCS_Lvl RBCS_Id RBCS_Desc
## 0 0 0
## HCPCS_Cd HCPCS_Desc Suplr_Rentl_Ind
## 0 0 0
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 0 0 0
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 0 0 0
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 0 0 0
## Year
## 0
# Find integer columns ( discrete)
int_cols <- sapply(combined_data, is.integer)
discrete_vars <- names(combined_data)[int_cols]
print("Discrete Variables (Integers):")
## [1] "Discrete Variables (Integers):"
print(discrete_vars)
## [1] "Tot_Rfrg_Prvdrs" "Tot_Suplrs" "Tot_Suplr_Benes" "Tot_Suplr_Clms"
## [5] "Tot_Suplr_Srvcs" "Year"
# Identify continuous variables (numeric but not integers)
num_cols <- sapply(combined_data, is.numeric)
continuous_vars <- names(combined_data)[num_cols & !int_cols]
print("Continuous Variables (Numeric):")
## [1] "Continuous Variables (Numeric):"
print(continuous_vars)
## [1] "Avg_Suplr_Sbmtd_Chrg" "Avg_Suplr_Mdcr_Alowd_Amt"
## [3] "Avg_Suplr_Mdcr_Pymt_Amt" "Avg_Suplr_Mdcr_Stdzd_Amt"
# Identify boolean columns (binary values)
boolean_columns <- sapply(combined_data, function(x) is.character(x) && length(unique(x)) == 2)
print(names(combined_data)[boolean_columns])
## [1] "Rfrg_Prvdr_Geo_Lvl" "Suplr_Rentl_Ind"
# Identify nominal columns (categorical & more than 2 unique value)
nominal_columns <- sapply(combined_data, function(x) is.character(x) && length(unique(x)) > 2)
print(names(combined_data)[nominal_columns])
## [1] "Rfrg_Prvdr_Geo_Cd" "Rfrg_Prvdr_Geo_Desc" "RBCS_Lvl"
## [4] "RBCS_Id" "RBCS_Desc" "HCPCS_Cd"
## [7] "HCPCS_Desc"
Duplicate Analysis
# Check for duplicate rows in the combined_data
duplicate_rows <- sum(duplicated(combined_data))
# Print the number of duplicate rows
print(paste("Number of duplicate rows: ", duplicate_rows))
## [1] "Number of duplicate rows: 0"
Normalization
# Loop through each continuous variable and plot histograms
for (col_name in continuous_vars) {
# Extract the data for the current column
column_data <- combined_data[[col_name]]
# Plot histogram with a normal curve using plotNormalHistogram()
plotNormalHistogram(column_data,
main = paste("Histogram of", col_name),
xlab = col_name)
}




# Subsample 5000 rows from each continuous column
subsample_Avg_Suplr_Sbmtd_Chrg <- sample(combined_data[["Avg_Suplr_Sbmtd_Chrg"]], 5000)
# Apply the Shapiro-Wilk test
print(shapiro.test(subsample_Avg_Suplr_Sbmtd_Chrg))
##
## Shapiro-Wilk normality test
##
## data: subsample_Avg_Suplr_Sbmtd_Chrg
## W = 0.30569, p-value < 2.2e-16
subsample_Avg_Suplr_Mdcr_Alowd_Amt <- sample(combined_data[["Avg_Suplr_Mdcr_Alowd_Amt"]], 5000)
print(shapiro.test(subsample_Avg_Suplr_Mdcr_Alowd_Amt))
##
## Shapiro-Wilk normality test
##
## data: subsample_Avg_Suplr_Mdcr_Alowd_Amt
## W = 0.24844, p-value < 2.2e-16
subsample_Avg_Suplr_Mdcr_Pymt_Amt <- sample(combined_data[["Avg_Suplr_Mdcr_Pymt_Amt"]], 5000)
print( shapiro.test(subsample_Avg_Suplr_Mdcr_Pymt_Amt))
##
## Shapiro-Wilk normality test
##
## data: subsample_Avg_Suplr_Mdcr_Pymt_Amt
## W = 0.24635, p-value < 2.2e-16
subsample_Avg_Suplr_Mdcr_Stdzd_Amt <- sample(combined_data[["Avg_Suplr_Mdcr_Stdzd_Amt"]], 5000)
print( shapiro.test(subsample_Avg_Suplr_Mdcr_Stdzd_Amt))
##
## Shapiro-Wilk normality test
##
## data: subsample_Avg_Suplr_Mdcr_Stdzd_Amt
## W = 0.2375, p-value < 2.2e-16
# Scale the continuous variables without modifying the original dataset
scaled_data <- scale(combined_data[continuous_vars])
# View the scaled dataset (just the scaled variables)
head(scaled_data)
## Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt
## 1 -0.2985382 -0.2846836 -0.2841732
## 2 -0.2988250 -0.2846732 -0.2842437
## 3 -0.2985781 -0.2846699 -0.2841374
## 4 -0.2984776 -0.2846699 -0.2841365
## 5 -0.2987096 -0.2846771 -0.2841426
## 6 -0.2985184 -0.2846705 -0.2841513
## Avg_Suplr_Mdcr_Stdzd_Amt
## 1 -0.2853697
## 2 -0.2854403
## 3 -0.2853339
## 4 -0.2853330
## 5 -0.2853391
## 6 -0.2853478
# Loop through each numeric column in the scaled data and plot a histogram
for (i in 1:ncol(scaled_data)) {
# Plot histogram with a normal curve
plotNormalHistogram(scaled_data,
main = paste("Histogram of scaled_data", col_name),
xlab = col_name)
}

# Loop through each continuous variable and apply square root transformation
for (col_name in continuous_vars) {
# Apply square root transformation
transformed_data <- sqrt(combined_data[[col_name]])
# Plot histogram with a normal curve
plotNormalHistogram(transformed_data,
main = paste("Histogram of T_Square", col_name),
xlab = col_name)
}




# Apply cube root transformation and plot histograms
for (col_name in continuous_vars) {
# Apply cube root transformation
T_cub <- sign(combined_data[[col_name]]) * abs(combined_data[[col_name]])^(1/3)
# plot histograms with normal distribution overlay
plotNormalHistogram(T_cub,
main = paste("Histogram of T_Cube", col_name),
xlab = col_name)
}




# Loop through each continuous variable, apply log transformation, and plot histograms
for (col_name in continuous_vars) {
# Apply log transformation with a small constant added
T_log <- log(combined_data[[col_name]] + 1) # Add 1 to avoid log(0) errors
# plot histograms with normal distribution overlay
plotNormalHistogram(T_log,
main = paste("Histogram of T_log", col_name),
xlab = col_name)
# QQ Plot for log-transformed variable
qqnorm(T_log,
ylab = paste("Sample Quantiles for", col_name),
main = paste("QQ Plot ", col_name))
# Add QQ line
qqline(T_log,
col = "red")
}








# Randomly sample 5000 rows from the dataset
subsample <- sample(T_log, 5000)
# Apply the Shapiro-Wilk test to the subsample
shapiro.test(subsample)
##
## Shapiro-Wilk normality test
##
## data: subsample
## W = 0.98642, p-value < 2.2e-16
# Calculate Spearman's Rank Correlation for continuous variables
spearman_correlation_matrix <- cor(combined_data[continuous_vars], method = "spearman")
# View the Spearman correlation matrix
print(spearman_correlation_matrix)
## Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt
## Avg_Suplr_Sbmtd_Chrg 1.0000000 0.9648672
## Avg_Suplr_Mdcr_Alowd_Amt 0.9648672 1.0000000
## Avg_Suplr_Mdcr_Pymt_Amt 0.9643239 0.9997474
## Avg_Suplr_Mdcr_Stdzd_Amt 0.9597192 0.9974304
## Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## Avg_Suplr_Sbmtd_Chrg 0.9643239 0.9597192
## Avg_Suplr_Mdcr_Alowd_Amt 0.9997474 0.9974304
## Avg_Suplr_Mdcr_Pymt_Amt 1.0000000 0.9976300
## Avg_Suplr_Mdcr_Stdzd_Amt 0.9976300 1.0000000
# Loop through each discrete variable and plot histograms
for (col_name in discrete_vars) {
# Extract the data for the current column
column_data <- combined_data[[col_name]]
# Plot histogram for the discrete variable with normal distribution overlay
plotNormalHistogram(column_data,
main = paste("Histogram of Discrete", col_name),
xlab = col_name)
}






# Loop through each discrete variable and create a bar plot
for (col_name in discrete_vars) {
# Create the bar plot
barplot(table(combined_data[[col_name]]),
main = paste("Distribution of", col_name),
xlab = col_name,
ylab = "Frequency",
col = "red")
}






# Calculate Spearman's Rank Correlation for discrete variables
spearman_correlation_matrix <- cor(combined_data[discrete_vars], method = "spearman")
# View the Spearman correlation matrix
print(spearman_correlation_matrix)
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes Tot_Suplr_Clms
## Tot_Rfrg_Prvdrs 1.000000000 0.89837491 0.80542044 0.89775162
## Tot_Suplrs 0.898374912 1.00000000 0.73070515 0.81571093
## Tot_Suplr_Benes 0.805420442 0.73070515 1.00000000 0.81571183
## Tot_Suplr_Clms 0.897751625 0.81571093 0.81571183 1.00000000
## Tot_Suplr_Srvcs 0.665880706 0.61512445 0.65662987 0.83173216
## Year -0.004721095 -0.01391891 0.01057104 0.01227504
## Tot_Suplr_Srvcs Year
## Tot_Rfrg_Prvdrs 0.66588071 -0.004721095
## Tot_Suplrs 0.61512445 -0.013918909
## Tot_Suplr_Benes 0.65662987 0.010571036
## Tot_Suplr_Clms 0.83173216 0.012275038
## Tot_Suplr_Srvcs 1.00000000 0.017659445
## Year 0.01765944 1.000000000
# Filter out only numeric columns from combined_data
numeric_data <- combined_data[, sapply(combined_data, is.numeric)]
# Calculate the Spearman correlation matrix for numeric columns
spearman_cor <- cor(numeric_data, method = "spearman", use = "complete.obs")
# Generate a heat map for the Spearman correlation matrix
corrplot(spearman_cor, method = "color", type = "ful",tl.cex = 0.5, addCoef.col = "black")

# Barplot for binary categorical variable
# Suplr_Rentl_Ind
barplot(table(combined_data$Suplr_Rentl_Ind), main = "Barplot of Suplr_Rentl_Ind",
xlab = "Supplier Rental Indicator", col = "orange")

#Rfrg_Prvdr_Geo_Lvl
barplot(table(combined_data$Rfrg_Prvdr_Geo_Lvl), main = "Barplot of Rfrg_Prvdr_Geo_Lvl",
xlab = "Referring Provider Geography Level ", col = "blue")

EDA analysis
# Install package
#install.packages("dataMaid")
# Import library
#library(dataMaid)
# Create report
#makeDataReport(combined_data, output = "html", replace = TRUE)
Defining underserved regions
# Calculating percentiles for high charges and low service volumes
charge_threshold_high <- quantile(combined_data$Avg_Suplr_Sbmtd_Chrg, 0.95, na.rm = TRUE) # 90th percentile for charges
service_threshold_low <- quantile(combined_data$Tot_Suplr_Srvcs, 0.05, na.rm = TRUE) # 10th percentile for services
# Flagging unusually high charges (above 95th percentile)
combined_data <- combined_data %>%
mutate(high_charges = ifelse(Avg_Suplr_Sbmtd_Chrg >= charge_threshold_high, TRUE, FALSE))
# Flagging low service volumes (below 5th percentile)
combined_data <- combined_data %>%
mutate(low_service_volumes = ifelse(Tot_Suplr_Srvcs <= service_threshold_low, TRUE, FALSE))
# Defining underserved regions where both conditions are met
combined_data <- combined_data %>%
mutate(is_underserved = ifelse(high_charges == TRUE & low_service_volumes == TRUE, TRUE, FALSE))
# Summary statistics for underserved regions
underserved_regions <- combined_data %>% filter(is_underserved == TRUE)
summary(underserved_regions)
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc RBCS_Lvl
## Length:2068 Length:2068 Length:2068 Length:2068
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## RBCS_Id RBCS_Desc HCPCS_Cd HCPCS_Desc
## Length:2068 Length:2068 Length:2068 Length:2068
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Suplr_Rentl_Ind Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## Length:2068 Min. : 1.00 Min. : 1.000 Min. :11.00
## Class :character 1st Qu.:10.00 1st Qu.: 4.000 1st Qu.:12.00
## Mode :character Median :12.00 Median : 7.000 Median :13.00
## Mean :11.35 Mean : 6.676 Mean :18.85
## 3rd Qu.:13.00 3rd Qu.: 9.000 3rd Qu.:15.00
## Max. :16.00 Max. :16.000 Max. :92.00
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt
## Min. :11.00 Min. :11.00 Min. : 2425 Min. : 82.3
## 1st Qu.:12.00 1st Qu.:12.00 1st Qu.: 3282 1st Qu.: 2237.6
## Median :13.00 Median :13.00 Median : 4835 Median : 3285.3
## Mean :13.11 Mean :13.35 Mean : 6989 Mean : 4433.8
## 3rd Qu.:14.00 3rd Qu.:15.00 3rd Qu.: 7887 3rd Qu.: 4946.6
## Max. :16.00 Max. :16.00 Max. :70913 Max. :39266.0
## Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt Year high_charges
## Min. : 60.18 Min. : 58.22 Min. :2014 Mode:logical
## 1st Qu.: 1730.06 1st Qu.: 1714.62 1st Qu.:2015 TRUE:2068
## Median : 2540.24 Median : 2526.78 Median :2017
## Mean : 3442.03 Mean : 3431.76 Mean :2017
## 3rd Qu.: 3848.31 3rd Qu.: 3852.76 3rd Qu.:2019
## Max. :31071.56 Max. :31019.03 Max. :2020
## low_service_volumes is_underserved
## Mode:logical Mode:logical
## TRUE:2068 TRUE:2068
##
##
##
##
str(underserved_regions)
## 'data.frame': 2068 obs. of 22 variables:
## $ Rfrg_Prvdr_Geo_Lvl : chr "State" "State" "State" "State" ...
## $ Rfrg_Prvdr_Geo_Cd : chr "02" "10" "35" "50" ...
## $ Rfrg_Prvdr_Geo_Desc : chr "Alaska" "Delaware" "New Mexico" "Vermont" ...
## $ RBCS_Lvl : chr "Durable Medical Equipment" "Durable Medical Equipment" "Durable Medical Equipment" "Durable Medical Equipment" ...
## $ RBCS_Id : chr "DD000N" "DD000N" "DD000N" "DD000N" ...
## $ RBCS_Desc : chr "DME-Wheelchairs" "DME-Wheelchairs" "DME-Wheelchairs" "DME-Wheelchairs" ...
## $ HCPCS_Cd : chr "E1002" "E1002" "E1002" "E1002" ...
## $ HCPCS_Desc : chr "Wheelchair accessory, power seating system, tilt only" "Wheelchair accessory, power seating system, tilt only" "Wheelchair accessory, power seating system, tilt only" "Wheelchair accessory, power seating system, tilt only" ...
## $ Suplr_Rentl_Ind : chr "N" "N" "N" "N" ...
## $ Tot_Rfrg_Prvdrs : int 12 15 12 15 3 4 5 12 14 14 ...
## $ Tot_Suplrs : int 3 4 5 6 3 3 3 10 12 5 ...
## $ Tot_Suplr_Benes : int 12 15 12 15 86 86 86 12 14 15 ...
## $ Tot_Suplr_Clms : int 12 15 12 15 11 13 14 12 14 15 ...
## $ Tot_Suplr_Srvcs : int 12 15 12 15 11 13 14 12 14 15 ...
## $ Avg_Suplr_Sbmtd_Chrg : num 5252 6610 6727 5590 8442 ...
## $ Avg_Suplr_Mdcr_Alowd_Amt: num 3820 3818 3789 3804 330 ...
## $ Avg_Suplr_Mdcr_Pymt_Amt : num 2995 2993 2971 2982 259 ...
## $ Avg_Suplr_Mdcr_Stdzd_Amt: num 2995 2993 2971 2982 259 ...
## $ Year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ high_charges : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ low_service_volumes : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ is_underserved : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
# View underserved regions
head(underserved_regions)
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 7208 State 02 Alaska
## 7214 State 10 Delaware
## 7237 State 35 New Mexico
## 7251 State 50 Vermont
## 7261 State 19 Iowa
## 7264 State 27 Minnesota
## RBCS_Lvl RBCS_Id RBCS_Desc HCPCS_Cd
## 7208 Durable Medical Equipment DD000N DME-Wheelchairs E1002
## 7214 Durable Medical Equipment DD000N DME-Wheelchairs E1002
## 7237 Durable Medical Equipment DD000N DME-Wheelchairs E1002
## 7251 Durable Medical Equipment DD000N DME-Wheelchairs E1002
## 7261 Durable Medical Equipment DD000N DME-Wheelchairs E1002
## 7264 Durable Medical Equipment DD000N DME-Wheelchairs E1002
## HCPCS_Desc Suplr_Rentl_Ind
## 7208 Wheelchair accessory, power seating system, tilt only N
## 7214 Wheelchair accessory, power seating system, tilt only N
## 7237 Wheelchair accessory, power seating system, tilt only N
## 7251 Wheelchair accessory, power seating system, tilt only N
## 7261 Wheelchair accessory, power seating system, tilt only Y
## 7264 Wheelchair accessory, power seating system, tilt only Y
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes Tot_Suplr_Clms Tot_Suplr_Srvcs
## 7208 12 3 12 12 12
## 7214 15 4 15 15 15
## 7237 12 5 12 12 12
## 7251 15 6 15 15 15
## 7261 3 3 86 11 11
## 7264 4 3 86 13 13
## Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt
## 7208 5252.083 3820.1150 2994.9717
## 7214 6609.733 3817.6147 2993.0113
## 7237 6726.528 3789.0050 2970.5817
## 7251 5590.318 3804.1313 2982.4393
## 7261 8441.909 329.9191 258.6618
## 7264 3596.281 328.6085 257.6338
## Avg_Suplr_Mdcr_Stdzd_Amt Year high_charges low_service_volumes
## 7208 2994.9717 2014 TRUE TRUE
## 7214 2993.0113 2014 TRUE TRUE
## 7237 2970.5817 2014 TRUE TRUE
## 7251 2982.4393 2014 TRUE TRUE
## 7261 258.6618 2014 TRUE TRUE
## 7264 270.7031 2014 TRUE TRUE
## is_underserved
## 7208 TRUE
## 7214 TRUE
## 7237 TRUE
## 7251 TRUE
## 7261 TRUE
## 7264 TRUE
# Filter the underserved regions
underserved_regions <- combined_data %>% filter(is_underserved == TRUE)
# View the first few rows with geographic data and year
head(underserved_regions %>% select(Year, Rfrg_Prvdr_Geo_Desc, Rfrg_Prvdr_Geo_Cd, Tot_Suplr_Srvcs, Avg_Suplr_Sbmtd_Chrg))
## Year Rfrg_Prvdr_Geo_Desc Rfrg_Prvdr_Geo_Cd Tot_Suplr_Srvcs
## 7208 2014 Alaska 02 12
## 7214 2014 Delaware 10 15
## 7237 2014 New Mexico 35 12
## 7251 2014 Vermont 50 15
## 7261 2014 Iowa 19 11
## 7264 2014 Minnesota 27 13
## Avg_Suplr_Sbmtd_Chrg
## 7208 5252.083
## 7214 6609.733
## 7237 6726.528
## 7251 5590.318
## 7261 8441.909
## 7264 3596.281
# Group underserved regions by year and geographic description
underserved_by_year_geo <- underserved_regions %>%
group_by(Year, Rfrg_Prvdr_Geo_Desc) %>%
summarise(Total_Underserved = n(),
Avg_Supplier_Services = mean(Tot_Suplr_Srvcs, na.rm = TRUE),
Avg_Supplier_Charges = mean(Avg_Suplr_Sbmtd_Chrg, na.rm = TRUE)) %>%
arrange(Year, desc(Total_Underserved))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# View the underserved regions by year and geography
head(underserved_by_year_geo)
## # A tibble: 6 × 5
## # Groups: Year [1]
## Year Rfrg_Prvdr_Geo_Desc Total_Underserved Avg_Supplier_Services
## <int> <chr> <int> <dbl>
## 1 2014 National 17 13.3
## 2 2014 Florida 10 13
## 3 2014 Nevada 10 13
## 4 2014 Georgia 8 14
## 5 2014 New Mexico 8 13.4
## 6 2014 Oregon 8 14.1
## # ℹ 1 more variable: Avg_Supplier_Charges <dbl>
# Select the top 5 underserved regions for each year using the filtered dataset
top_underserved_by_year <- underserved_by_year_geo %>%
group_by(Year) %>%
slice_max(order_by = Total_Underserved, n = 5) %>%
ungroup()
# Plot the top 5 underserved regions by year and geographic area
ggplot(top_underserved_by_year,
aes(x=reorder(Rfrg_Prvdr_Geo_Desc, -Total_Underserved), y=Total_Underserved, fill=factor(Year))) +
geom_bar(stat="identity") +
coord_flip() + # Flip the coordinates to make the bar chart horizontal
facet_wrap(~ Year, scales = "free_y") +
labs(title="Top 5 Underserved Regions by Year", x="Geographic Region", y="Total Underserved") +
theme_minimal()

# Plot trend of underserved regions over time
ggplot(underserved_by_year_geo, aes(x=Year, y=Total_Underserved, color=Rfrg_Prvdr_Geo_Desc)) +
geom_line() +
labs(title="Trend of Underserved Regions Over Time", x="Year", y="Total Underserved") +
theme_minimal()

write.csv(combined_data, "combined_data.csv", row.names = FALSE)
# View a summary of the combined_data
summary(combined_data)
## Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc RBCS_Lvl
## Length:295052 Length:295052 Length:295052 Length:295052
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## RBCS_Id RBCS_Desc HCPCS_Cd HCPCS_Desc
## Length:295052 Length:295052 Length:295052 Length:295052
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Suplr_Rentl_Ind Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## Length:295052 Min. : 1.0 Min. : 1.00 Min. : 11
## Class :character 1st Qu.: 16.0 1st Qu.: 7.00 1st Qu.: 33
## Mode :character Median : 47.0 Median : 17.00 Median : 89
## Mean : 478.6 Mean : 94.38 Mean : 1669
## 3rd Qu.: 180.0 3rd Qu.: 49.00 3rd Qu.: 295
## Max. :271965.0 Max. :49095.00 Max. :3495668
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## Min. : 11 Min. : 11 Min. : 0.01
## 1st Qu.: 33 1st Qu.: 54 1st Qu.: 19.36
## Median : 121 Median : 305 Median : 92.21
## Mean : 4371 Mean : 83082 Mean : 541.91
## 3rd Qu.: 600 3rd Qu.: 3015 3rd Qu.: 393.33
## Max. :10498977 Max. :608845461 Max. :70913.34
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## Min. : 0.01 Min. : 0.000 Min. : 0.000
## 1st Qu.: 9.49 1st Qu.: 7.157 1st Qu.: 7.377
## Median : 48.73 Median : 36.761 Median : 37.475
## Mean : 340.29 Mean : 263.389 Mean : 264.212
## 3rd Qu.: 206.05 3rd Qu.: 159.055 3rd Qu.: 162.317
## Max. :39757.35 Max. :31071.560 Max. :31019.030
## Year high_charges low_service_volumes is_underserved
## Min. :2014 Mode :logical Mode :logical Mode :logical
## 1st Qu.:2015 FALSE:280299 FALSE:277781 FALSE:292984
## Median :2017 TRUE :14753 TRUE :17271 TRUE :2068
## Mean :2017
## 3rd Qu.:2019
## Max. :2020
# Step 1: Identify numeric columns
num_cols <- combined_data[sapply(combined_data, is.numeric)]
# Step 2: Calculate standard deviation for each numeric column
sd_values <- sapply(num_cols, sd)
# Print the standard deviation values
print(sd_values)
## Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes
## 4.182212e+03 6.860545e+02 2.582794e+04
## Tot_Suplr_Clms Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg
## 7.647500e+04 2.317080e+06 1.779177e+03
## Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## 1.185538e+03 9.193592e+02 9.183867e+02
## Year
## 1.997253e+00